Allow shifting instead of masks if the types are legal. #162732
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: AZero13 (AZero13)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/162732.diff

3 Files Affected:
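For context, a minimal IR sketch (not part of the patch, just mirroring the bzhi tests below) of the variable-mask pattern that shouldFoldMaskToVariableShiftPair gates, and the equivalent shift pair the new default prefers whenever SHL is legal for the type:

; Mask form: build a mask of %numlowbits low ones and AND it in.
define i32 @mask_form(i32 %val, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %mask, %val
  ret i32 %masked
}

; Shift-pair form: push the unwanted high bits out, then shift zeros back in.
define i32 @shift_pair_form(i32 %val, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %hi = shl i32 %val, %numhighbits
  %lo = lshr i32 %hi, %numhighbits
  ret i32 %lo
}

The shift pair reuses %numhighbits directly instead of materializing an all-ones mask; whether that is actually a win on AMDGPU is questioned in the review below.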
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 88691b931a8d8..9865143edbdfb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -839,8 +839,11 @@ class LLVM_ABI TargetLoweringBase {
/// Return true if the variant with 2 variable shifts is preferred.
/// Return false if there is no preference.
virtual bool shouldFoldMaskToVariableShiftPair(SDValue X) const {
- // By default, let's assume that no one prefers shifts.
- return false;
+ // By default, let's assume that no one prefers shifts for vectors
+ EVT VT = X.getValueType();
+
+ // Prefer shifts for legal types
+ return isOperationLegal(ISD::SHL, VT);
}
/// Return true if it is profitable to fold a pair of shifts into a mask.
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 89bd5f1dc7969..59bc1dd019de3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -103,16 +103,16 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
@@ -121,12 +121,23 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
}
define i32 @bzhi32_c0_clamp(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_c0_clamp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 31, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: bzhi32_c0_clamp:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 31, v1
+; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_c0_clamp:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_and_b32_e32 v1, 31, v1
+; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
%low5bits = and i32 %numlowbits, 31
%numhighbits = sub i32 32, %low5bits
%mask = lshr i32 -1, %numhighbits
@@ -139,16 +150,16 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c1_indexzext:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v1, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i8 32, %numlowbits
%sh_prom = zext i8 %numhighbits to i32
@@ -162,16 +173,16 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
-; SI-NEXT: v_lshr_b32_e32 v1, -1, v1
-; SI-NEXT: v_and_b32_e32 v0, v0, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_c4_commutative:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
-; VI-NEXT: v_lshrrev_b32_e64 v1, v1, -1
-; VI-NEXT: v_and_b32_e32 v0, v0, v1
+; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
%mask = lshr i32 -1, %numhighbits
diff --git a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
index 5b21a3652d11c..1055fc101a6c5 100644
--- a/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -266,7 +266,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -279,9 +279,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
-; EG-NEXT: -1(nan), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y,
+; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
+; EG-NEXT: LSHR T0.X, PV.W, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
@@ -289,7 +288,7 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[], KC1[]
; CM-NEXT: TEX 0 @6
-; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -302,9 +301,8 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, ptr add
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
-; CM-NEXT: LSHR * T0.W, literal.x, PV.W,
-; CM-NEXT: -1(nan), 0(0.000000e+00)
-; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y,
+; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
+; CM-NEXT: LSHR * T0.X, PV.W, T0.W,
; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%numhighbits = sub i8 32, %numlowbits
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 31, v1
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
Regression, broke a BFE pattern?
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
This is a neutral change at best
Why? I don't see any tests that show a benefit. I guess it could conceivably reduce register pressure, since the mask has to be materialized in a register, but that would only help if the shift amount has more uses, and there aren't any test cases for that.
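For illustration only, a hypothetical test case along those lines (not in this PR), where the shift amount feeds a second instruction, so the mask form would keep both %numhighbits and the materialized mask live:

; Hypothetical: %numhighbits has a second use, so the shift-pair form could
; in principle drop the register that holds the all-ones mask.
define i32 @bzhi32_shared_amount(i32 %val, i32 %other, i32 %numlowbits) {
  %numhighbits = sub i32 32, %numlowbits
  %mask = lshr i32 -1, %numhighbits
  %masked = and i32 %mask, %val
  %shifted = lshr i32 %other, %numhighbits   ; second use of %numhighbits
  %r = xor i32 %masked, %shifted
  ret i32 %r
}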
No description provided.